#Class 04/05
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ stringr 1.4.0
## ✓ tidyr   1.1.4     ✓ forcats 0.5.1
## ✓ readr   2.1.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
f <- "https://raw.githubusercontent.com/difiore/ada-2022-datasets/main/KamilarAndCooperData.csv"
d <- read_csv(f, col_names = TRUE)
## Rows: 213 Columns: 44
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Scientific_Name, Family, Genus, Species, Brain_size_Ref, Mass_Ref,...
## dbl (28): Brain_Size_Species_Mean, Brain_Size_Female_Mean, Body_mass_male_me...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
g<-d[!is.na(d$Body_mass_female_mean),]
damend<-g[!is.na(g$MaxLongevity_m),]
plot(damend$MaxLongevity_m~damend$Body_mass_female_mean)

##When plotted, the data is not linear. It is very heavily skewed to the left. We could log transform the data and see if that makes it appear more linear. 
log_maxlong<-log(damend$MaxLongevity_m)
log_massfem<- log(damend$Body_mass_female_mean)
plot(log_maxlong~log_massfem)

#plotting with log transform makes the data more linear, 
# This is the Female mass vs Longevity
library(lmodel2)
m1<-lm(MaxLongevity_m~Body_mass_female_mean, data=damend)
m1
## 
## Call:
## lm(formula = MaxLongevity_m ~ Body_mass_female_mean, data = damend)
## 
## Coefficients:
##           (Intercept)  Body_mass_female_mean  
##             283.47213                0.00905
resid_m1<-resid(m1)
hist(resid_m1)

g<-d[!is.na(d$Body_mass_female_mean),]
damend<-g[!is.na(g$MaxLongevity_m),]
plot(resid_m1~damend$Body_mass_female_mean)

## This histogram appears to be right skewed, but relatively normally distributed.

#Plotting the Log of Female mass vs Lifespan  
m2<- lm(damend$MaxLongevity_m~log_massfem)
m2
## 
## Call:
## lm(formula = damend$MaxLongevity_m ~ log_massfem)
## 
## Coefficients:
## (Intercept)  log_massfem  
##     -128.86        59.34
resid_m2<-resid(m2)
hist(resid_m2)

plot(resid_m2~log_massfem)

##Still right skewed but more normally distributed than the data without being log transformed. 

#Plotting the Log of Female mass vs the log of Lifespan  
m3<- lm(log_maxlong~log_massfem)
m3
## 
## Call:
## lm(formula = log_maxlong ~ log_massfem)
## 
## Coefficients:
## (Intercept)  log_massfem  
##       4.259        0.190
Resid_m3<-resid(m3)
hist(Resid_m3)

plot(Resid_m3~log_massfem)

## I think the Log of Female mass vs Lifespan was more normally distributed than both of the variables being transformed. When both of the variables are transformed, the histogram looks to be normally distributed with a slight left-skew. 

qqnorm(residuals(m1))
qqline(residuals(m1))

qqnorm(residuals(m2))
qqline(residuals(m2))

qqnorm(residuals(m3))
qqline(residuals(m3))

## In the QQplot, the data where both varibles are transformed looks to be the most normally distributed, as it lies the closest to the diagonal line. All datasets look to be relatively normally distributed. 

plot(m1)

plot(m2)

plot(m3)

## All the plots generated in this step give us insight on the linearity of the data and if there are certain points that have a lot of influence on the regression model. It looks like m2, or the model where Female Mass is transformed hasthe least amount of points that heavily skew or lie outside of the predictors for the regression model selection.

shapiro.test(m1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  m1$residuals
## W = 0.96352, p-value = 0.0008238
shapiro.test(m2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  m2$residuals
## W = 0.97515, p-value = 0.01131
shapiro.test(m3$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  m3$residuals
## W = 0.99319, p-value = 0.7403
## The Shapiro-Wilks test gives us descriptive stats on the normality of the data. In the shapiro wilks test, if the value of p is less than or equal to 0.05, we can conclude it is normally distributed. In this case the untransformed data and the model where Female mass are transformed are the only two models that display normality according to Shapiro Wilks.